'''
Created on Oct 5, 2011

@author: hp
'''

import os

def split(fname, ratio=0.1):
    """Split the relevant lines of ``bnc/<fname>`` into a train and a test file.

    Only lines containing the substring ``subj`` or ``obj`` are kept. The
    first ``max(1, int(ratio * total_lines))`` kept lines are written to
    ``data/<fname>.train``; every remaining kept line is written to
    ``data/<fname>.test``. ``total_lines`` counts every newline-separated
    piece of the input (skipped lines and the empty piece after a trailing
    newline included), not only the kept lines.

    Args:
        fname: File name, relative to the ``bnc/`` directory.
        ratio: Fraction of the input's total line count used as the cap on
            the number of lines in the train file.

    Raises:
        IOError: If the input file cannot be opened or an output file
            cannot be created (``OSError`` on Python 3).
    """
    # Read the input first and release it immediately; each file is managed
    # by its own ``with`` so no handle leaks if a later open() fails.
    with open("bnc/" + fname, 'r') as fin:
        lines = fin.read().split('\n')

    # The cap is derived from the *total* line count. At least one kept line
    # always goes to the train file, even when the cap rounds down to zero.
    train_cap = max(1, int(ratio * len(lines)))

    kept = [line for line in lines if 'subj' in line or 'obj' in line]

    with open("data/" + fname + ".train", 'w') as fout:
        fout.writelines(line + "\n" for line in kept[:train_cap])
    with open("data/" + fname + ".test", 'w') as fout:
        fout.writelines(line + "\n" for line in kept[train_cap:])

def split_verb(verb="", ratio=0.1):
    """Run ``split`` on every source file in ``bnc/`` whose name contains ``verb``.

    File names containing ``.e.`` or ``.edoc.``, and names containing
    ``.train`` or ``.test``, are skipped. The path of each selected file is
    printed before it is split.

    Args:
        verb: Substring a file name must contain to be processed; the
            default empty string matches every name.
        ratio: Passed through unchanged to ``split``.
    """
    skip_markers = (".e.", ".edoc.", ".train", ".test")
    for fname in os.listdir("bnc"):
        if verb not in fname:
            continue
        if any(marker in fname for marker in skip_markers):
            continue
        # The parenthesised single-argument form prints the same text under
        # the Python 2 print statement and the Python 3 print function.
        print("bnc/" + fname)
        split(fname, ratio)
    
if __name__ == "__main__":
    # Script entry point: split every matching "explain" file in bnc/ using
    # the default ratio.
    split_verb("explain")